Compare core genome mapping method to 16S rRNA gene amplicon sequencing variants of Gardnerella

Set up

Packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.5     ✔ dplyr   1.0.7
## ✔ tidyr   1.1.4     ✔ stringr 1.4.0
## ✔ readr   2.0.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(phyloseq)
library(Biostrings)
library(formattable)
library(ggpubr)
library(cowplot)
`%!in%` <- negate(`%in%`)

Paths

repoDir <- ".."
dataDir <- "../../"
figureOut <- "../../FIGURES/submission_manuscript_figures"

#shotgun metadata
sgDF <- file.path(repoDir, "sumgsDF.tsv") %>%
  read_tsv %>%
  mutate(SampleID=as.character(SampleID),
         SubjectID=as.character(SubjectID),
         pctHuman=((Sickle_pairs-bbmapFiltered_pairs)/Sickle_pairs)*100) # calculate percent human reads

#ASVs from Callahan et al., 2017 PNAS Replication and Refinement
S16_path <- file.path(dataDir, "16S_data/RepRefine_Scripts/input/processed.rda")
load(S16_path)

#Gardnerella 16S fasta sequences from reference WGS strains
rRNAs16S <- file.path(dataDir, "gardPhylogeny/prokka_annotated_genomes/ffn") %>%
  list.files(full.names = TRUE, pattern = "ffn") %>%
  map(readDNAStringSet) %>%
  do.call("c", .) %>%
  .[str_detect(names(.), "16S ribosomal RNA")]

#gardnerella metadata
refDF <- file.path(repoDir, "refGardnerellaCladesGenomos.csv") %>%
  read_csv

#Gardnerella clade abundance from cladeAssignments.Rmd
sgCladeDF <- file.path(repoDir, "gardCladeRelativeAbundance.tsv") %>%
  read_tsv() %>%
  mutate(SampleID=as.character(SampleID))

#Gardnerella genomospecies abundance from cladeAssignments.Rmd
sgGenomoDF <- file.path(repoDir, "gardGenomospeciesRelativeAbundance.tsv") %>%
  read_tsv() %>%
  mutate(SampleID=as.character(SampleID))

ggplot themes

theme_set(theme_bw())
# variant colors for clades vs V4
ggplotCladeV4 <- list(scale_size_manual(values=c(1.5, 3)),
  scale_color_manual(values=c("salmon", "indianred", "lightblue", "blue3", "darkorchid2", "yellowgreen", "red", "blue1", "grey48", "darkorchid2", "yellowgreen"), labels=c("Clade C1 (shotgun)", "Clade C2 (shotgun)", "Clade C3 (shotgun)", "Clade C4 (shotgun)", "Clade C5 (shotgun)", "Clade C6 (shotgun)", "G2: Clades 1/2 (V4 amplicon)", "G1: Clades 3/4 (V4 amplicon)", "G3: indeterminant (V4 amplicon)", "G4: Clade 5 (V4 amplicon)", "G5: Clade 6 (V4 amplicon)")),
  scale_shape_manual(values = c(16, 1)))

# clade, genomospecies, and V4 variant colors
ggplotAll <- list(scale_size_manual(values=c(1.5, 2, 3)),
  scale_color_manual(values=c("salmon", "indianred", "lightblue", "blue3", "darkorchid2", "yellowgreen",
                              "salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "blue4", "blue", "lightblue1", "lightblue2", "lightblue3", "lightblue4",   "darkorchid2", "yellowgreen", "cadetblue1",
                              "red", "blue1", "grey48", "darkorchid2", "yellowgreen"),
                     labels=c("Clade C1 (shotgun)", "Clade C2 (shotgun)", "Clade C3 (shotgun)", "Clade C4 (shotgun)", "Clade C5 (shotgun)", "Clade C6 (shotgun)",
                              "Genomospecies 1, G. vaginalis (shotgun)",
                              "Genomospecies 2 (shotgun)",
                              "Genomospecies 3 (shotgun)",
                              "Genomospecies 4, G. piotti (shotgun)",
                              "Genomospecies 11 (shotgun)",
                              "Genomospecies 5, G. leopoldii (shotgun)",
                              "Genomospecies 6, G. swidsinskii (shotgun)",
                              "Genomospecies 7 (shotgun)",
                              "Genomospecies 8 (shotgun)",
                              "Genomospecies 9 (shotgun)",
                              "Genomospecies 10 (shotgun)",
                              "Genomospecies 12 (shotgun)",
                              "Genomospecies 13 (shotgun)",
                              "Genomospecies 14 (shotgun)",
                              "G2: Clades 1/2 (V4 amplicon)", "G1: Clades 3/4 (V4 amplicon)", "G3: indetereminant (V4 amplicon)", "G4: Clade 5 (V4 amplicon)", "G5: Clade 6 (V4 amplicon)")),
  scale_shape_manual(values = c(16, 5, 1)))  

# clade and genomospecies variant colors  
ggplotShotgunOnly <- list(scale_size_manual(values=c(1.5, 2)),
  scale_color_manual(values=c("salmon", "indianred", "lightblue", "blue3", "darkorchid2", "yellowgreen",
                              "salmon1", "salmon3", "indianred1", "indianred2", "indianred3", "blue4", "blue", "lightblue1", "lightblue2", "lightblue3", "lightblue4",   "darkorchid2", "yellowgreen", "cadetblue1"),
                     labels=c("Clade C1", "Clade C2", "Clade C3", "Clade C4", "Clade C5", "Clade C6",
                              "Genomospecies 1, G. vaginalis",
                              "Genomospecies 2",
                              "Genomospecies 3",
                              "Genomospecies 4, G. piotti",
                              "Genomospecies 11",
                              "Genomospecies 5, G. leopoldii",
                              "Genomospecies 6, G. swidsinskii",
                              "Genomospecies 7",
                              "Genomospecies 8",
                              "Genomospecies 9",
                              "Genomospecies 10",
                              "Genomospecies 12",
                              "Genomospecies 13",
                              "Genomospecies 14")),
  scale_shape_manual(values = c(16, 5)))

Set up abundance table from MetaPhlAn2 results

#metaphlan data
mDF0 <- file.path(dataDir, "/humann2/merged_tables/mergedMetaphlanAbundance.tsv") %>%
  read_tsv() %>%
  filter(str_detect(ID, "s__"), !str_detect(ID, "t__")) %>%
  mutate(Species = str_extract(ID, "(?<=s__)\\w+$")) %>%
  select(Species, everything(), -ID) %>%
  as.data.frame()

#make col and row names
samps <- colnames(mDF0[2:ncol(mDF0)]) %>%
  str_extract("[0-9]{10}")
samps
##   [1] "1000801248" "1000801318" "1000801368" "1001301158" "1001301248"
##   [6] "1001301318" "1001801138" "1001801248" "1001801318" "1002701138"
##  [11] "1002701158" "1002701278" "1003101118" "1003101188" "1003101278"
##  [16] "1004001168" "1004001278" "1004001348" "1005601098" "1005601168"
##  [21] "1005601348" "1050801308" "1050801318" "1050835178" "1050835218"
##  [26] "1060435158" "1060435318" "1060435348" "1060801198" "1060801338"
##  [31] "1060835148" "1061235118" "1061235188" "1061235278" "1061635088"
##  [36] "1061635208" "1061635368" "1062101108" "1062101238" "1062101338"
##  [41] "1062301128" "1062301198" "1062335178" "1062601228" "1062601258"
##  [46] "1062601368" "1062701128" "1062701298" "1062735148" "1062735198"
##  [51] "1062901138" "1062901218" "1062901318" "1900401198" "1900401288"
##  [56] "1900401398" "1902401118" "1902401208" "1902401308" "2050501168"
##  [61] "2050501298" "2050501348" "4002101198" "4002101298" "4002135018"
##  [66] "4003235018" "4003235288" "4003235368" "4004235208" "4004235248"
##  [71] "4004235268" "4004735328" "4004735368" "4004935158" "4004935268"
##  [76] "4004935338" "4005935278" "4005935328" "4005935358" "4006635198"
##  [81] "4006635258" "4006635278" "4006635338" "4006935208" "4006935248"
##  [86] "4006935358" "4007135108" "4007135118" "4007135148" "4007135288"
##  [91] "4007235168" "4007235278" "4007435168" "4007435248" "4007435258"
##  [96] "4007535198" "4007535238" "4007535358" "4008435158" "4008435348"
## [101] "4008435358" "4009035168" "4009035268" "4009035368" "4009835178"
## [106] "4009835228" "4009835268"
colnames(mDF0) <- c("Species", samps)
rownames(mDF0) <- mDF0$Species

#Final edits to table
mDF <- mDF0 %>%
  select(-Species) %>%
  t() %>%
  as_tibble() %>%
  mutate_all(funs(./100)) %>%
  mutate(SampleID=samps) %>% 
  select(SampleID, everything())
mDF[1:6,1:5]
## # A tibble: 6 × 5
##   SampleID   Actinobaculum_m… Actinobaculum_s… Actinobaculum_u… Actinomyces_eur…
##   <chr>                 <dbl>            <dbl>            <dbl>            <dbl>
## 1 1000801248                0                0                0                0
## 2 1000801318                0                0                0                0
## 3 1000801368                0                0                0                0
## 4 1001301158                0                0                0                0
## 5 1001301248                0                0                0                0
## 6 1001301318                0                0                0                0

Organize method abundances

Get V4 variants

Get V4 Variants that appear in these samples and determine corresponding clades using 16S rRNA sequences from

# all sample IDs that would have corresponding ASV data
studyIDs <- sgDF %>%
  filter(PaperCohort=="2017",
         RNAlater=="No") %>%
  .$SampleID

# phyloseq object of data from Callahan et al., 2017 PNAS  Replication and Refinement
PS17 <- phyloseq(otu_table(st, taxa_are_rows=FALSE), sample_data(df), tax_table(tax))
  #transform_sample_counts(function(x) {x/sum(x)}) #make read counts into relative abundances

has.shotgun <- PS17@sam_data$SampleID %in% studyIDs
PS16S_170 <- PS17 %>%
  subset_samples(has.shotgun) #keep only samples of interest in OTU table

S16gard <- PS16S_170 %>%
  psmelt %>% 
  filter(Genus=="Gardnerella", 
         Abundance>0)

Note: there are 9 variants among the entire sample set for 2017, but only 5 within the shotgun samples samples

Are there any samples in which Gardnerella was not found by 16S?

noGard16S <- setdiff(studyIDs, unique(S16gard$SampleID))
noGard16S
## [1] "1062301128" "1062301198"
PS16S_170 %>%
  psmelt %>%
  filter(Genus=="Gardnerella",
         SampleID %in% noGard16S) %>%
  select(SampleID, Abundance) %>%
  unique
##     SampleID Abundance
## 1 1062301128         0
## 2 1062301198         0

Two from subject 10623. Was Gardnerella found by MetaPhlAn2?

mDF %>%
  filter(SampleID %in% noGard16S) %>%
  select(SampleID, Gardnerella_vaginalis)
## # A tibble: 2 × 2
##   SampleID   Gardnerella_vaginalis
##   <chr>                      <dbl>
## 1 1062301128                     0
## 2 1062301198                     0

Gardnerella was not found by MetaPhlAn2 or 16S in these two samples.

Map variants to clades

Determine which V4 variants are in each reference WGS isolate

# get list of ASVs from 16S amplicon data
vars <- S16gard %>%
  select(OTU) %>%
  unique %>%
  mutate(var=paste0("G", c(1:nrow(.)))) %>%
  dplyr::rename(seq="OTU") %>%
  select(var, seq)

# Determine which clades each V4 ASV maps to in Gardnerella WGS reference sequences
varClades <- rRNAs16S %>%
  as.data.frame() %>%
  rownames_to_column("Strain") %>%
  dplyr::rename(seq=x) %>%
  mutate(Strain=str_extract(Strain, ".*(?=_[0-9]{5} 16S ribosomal RNA$)"),
         seq=str_extract(seq, paste(vars$seq, collapse = "|"))) %>%
  filter(!is.na(seq)) %>%
  left_join(refDF, by="Strain") %>%
  left_join(vars, by="seq") %>%
  select(var, Clade) %>%
  unique
  
varClades
##    var Clade
## 1   G1    C2
## 3   G2    C3
## 4   G1    C1
## 6   G3    C1
## 10  G2    C4
## 21  G4    C5
## 27  G3    C2
## 34  G5    C6
## 48  G3    C3

NOTE: THESE LABELS DO NOT MATCH Callahan 2017 G1/G2 ARE FLIPPED

Organize sample abundances

Get abundances of variants in each sample that has 16S and shotgun sequencing and format tables

vars0 <- vars %>%
  spread(var, seq)

S16gardDF <- S16gard %>%
  mutate(Var=case_when(OTU==vars0$G1~"G2: Clades 1/2", #change G1/G2 to match Callahan 2017 
                       OTU==vars0$G2~"G1: Clades 3/4",
                       OTU==vars0$G3~"G3: indetereminant",
                       OTU==vars0$G4~"G4: Clade 5",
                       OTU==vars0$G5~"G5: Clade 6")) %>%
  group_by(SampleID) %>%
  mutate(propGard=Abundance/sum(Abundance)) %>%
  ungroup() %>%
  select(OTU, SampleID, SubjectID, Abundance, propGard, GWcoll, Var) #RNAlater

S16gardDF$Var <- factor(S16gardDF$Var, levels = c("G2: Clades 1/2", "G1: Clades 3/4", "G3: indetereminant", "G4: Clade 5", "G5: Clade 6")) #Keep order G2, G1, G3, G4, G5

Organize dataframes

Place clade, genomospecies, and V4 variant abundances into one dataframe

S16gardDF0 <- S16gardDF %>%
  select(SampleID, Var, propGard) %>%
  mutate(Method="V4 region 16S amplicon")

sgCladeDF0 <- sgCladeDF %>%
  filter(SampleID %in% studyIDs) %>%
  dplyr::rename(Var=Clade,
                propGard=propClade) %>%
  mutate(Var=case_when(Var=="C1"~"Clade 1",
                       Var=="C2"~"Clade 2",
                       Var=="C3"~"Clade 3",
                       Var=="C4"~"Clade 4",
                       Var=="C5"~"Clade 5",
                       Var=="C6"~"Clade 6"),
         Method="Shotgun Clade")

sgGenomoDF0 <- sgGenomoDF %>%
  filter(SampleID %in% studyIDs) %>%
  dplyr::rename(Var=Genomospecies,
                propGard=propGenomospecies) %>%
  mutate(Var=as.character(Var),
          Var=case_when(Var=="GS1"~"Genomospecies 1, G. vaginalis",
                        Var=="GS2"~"Genomospecies 2",
                        Var=="GS3"~"Genomospecies 3",
                        Var=="GS4"~"Genomospecies 4, G. piotti",
                        Var=="GS5"~"Genomospecies 5, G. leopoldii",
                        Var=="GS6"~"Genomospecies 6, G. swidsinskii",
                        Var=="GS7"~"Genomospecies 7",
                        Var=="GS8"~"Genomospecies 8",
                        Var=="GS9"~"Genomospecies 9",
                        Var=="GS10"~"Genomospecies 10",
                        Var=="GS11"~"Genomospecies 11",
                        Var=="GS12"~"Genomospecies 12",
                        Var=="GS13"~"Genomospecies 13", 
                        Var=="GS14"~"Genomospecies 14"),
         Method="Shotgun Genomospecies")

gardAllVarDF <- S16gardDF0 %>%
  full_join(sgCladeDF0, by=c("SampleID", "Var", "propGard", "Method")) %>%
  full_join(sgGenomoDF0) %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "GWcoll")], by="SampleID") %>%
  mutate(Var=factor(Var, levels = c("Clade 1", "Clade 2", "Clade 3", "Clade 4", "Clade 5", "Clade 6",
                                     "Genomospecies 1, G. vaginalis", "Genomospecies 2", "Genomospecies 3", "Genomospecies 4, G. piotti", "Genomospecies 11", "Genomospecies 5, G. leopoldii", "Genomospecies 6, G. swidsinskii", "Genomospecies 7", "Genomospecies 8", "Genomospecies 9", "Genomospecies 10",  "Genomospecies 12", "Genomospecies 13", "Genomospecies 14",
                                     "G2: Clades 1/2", "G1: Clades 3/4", "G3: indetereminant", "G4: Clade 5", "G5: Clade 6")))

head(gardAllVarDF)
## # A tibble: 6 × 7
##   SampleID   Var            propGard Method                   n SubjectID GWcoll
##   <chr>      <fct>             <dbl> <chr>                <dbl> <chr>      <dbl>
## 1 1062101108 G2: Clades 1/2    1     V4 region 16S ampli…    NA 10621       10  
## 2 1062101338 G2: Clades 1/2    1     V4 region 16S ampli…    NA 10621       33  
## 3 1062101238 G2: Clades 1/2    1     V4 region 16S ampli…    NA 10621       23.1
## 4 1902401208 G1: Clades 3/4    0.923 V4 region 16S ampli…    NA 19024       20.3
## 5 1902401308 G1: Clades 3/4    0.816 V4 region 16S ampli…    NA 19024       30.4
## 6 1005601168 G2: Clades 1/2    1     V4 region 16S ampli…    NA 10056       16.6
unique(gardAllVarDF$Var)  
##  [1] G2: Clades 1/2                  G1: Clades 3/4                 
##  [3] G3: indetereminant              G4: Clade 5                    
##  [5] G5: Clade 6                     Clade 1                        
##  [7] Clade 2                         Clade 4                        
##  [9] Clade 5                         Clade 3                        
## [11] Clade 6                         Genomospecies 1, G. vaginalis  
## [13] Genomospecies 2                 Genomospecies 3                
## [15] Genomospecies 4, G. piotti      Genomospecies 5, G. leopoldii  
## [17] Genomospecies 12                Genomospecies 10               
## [19] Genomospecies 11                Genomospecies 13               
## [21] Genomospecies 14                Genomospecies 6, G. swidsinskii
## [23] Genomospecies 7                 Genomospecies 8                
## [25] Genomospecies 9                
## 25 Levels: Clade 1 Clade 2 Clade 3 Clade 4 Clade 5 ... G5: Clade 6

View assignments

First pass assignmentby mapping reads to variants without imposing any read count thresholds.

#fig.width=5, fig.height=4
gardAllVarDF %>%
  ggplot(aes(x=GWcoll, y=propGard, shape=Method, size=Method, color=Var)) +
  geom_point() +
  ggplotAll +
  facet_wrap(~SubjectID, nrow=2) +
  labs(x="Gestation weeks at collection", y="Proportion of Gardnerella", shape="Method", color="Variant") +
  guides(color=guide_legend(override.aes=list(shape=15, size=3))) +
  xlim(0,40)

Shotgun variants only with N reads on y-axis

gardAllVarDF %>%
  filter(str_detect(Method, "Shotgun")) %>%
  ggplot(aes(x=GWcoll, y=n, shape=Method, size=Method, color=Var)) +
  geom_point() +
  ggplotShotgunOnly +
  scale_y_log10() +
  facet_wrap(~SubjectID, nrow=2) +
  labs(x="Gestation weeks at collection", y="Proportion of Gardnerella", shape="Method", color="Variant") +
  guides(color=guide_legend(override.aes=list(shape=15, size=3))) +
  xlim(0,40)

Test Thresholds

Determine how many reads must map to each variant to count the variant as present. View variants over ranges of thresholds:

#fig.height=6, fig.width=9
# create data frames
thresholds <- c(1:49) %>% # set the threshold range
  map(~filter(gardAllVarDF, n>.x)) %>% # filter out clades below read threshold 
  map(~group_by(.x, SampleID, Method)) %>%
  map(~mutate(.x, propGard=propGard/sum(propGard))) %>% # recalculate proportions after threshold filtering
  map(ungroup) %>%
  map2(c(1:49), ~mutate(.x, Threshold=.y)) %>% # add threshold column
  purrr::reduce(full_join, by=c(colnames(gardAllVarDF), "Threshold")) # reduce to one data frame

# add threshold information to data frame
gardAllVarDFThresholds <- gardAllVarDF %>%
  mutate(Threshold=0) %>% # set zero threshold for baseline
  rbind(thresholds) %>%
  mutate(Threshold=Threshold+1) # make threshold as >=n 

# plot - thresholds 1 to 50
gardAllVarDFThresholds %>%
  ggplot(aes(x=Threshold, y=propGard, shape=Method, size=Method, color=Var)) + 
  geom_point() +
  ggplotAll +
  ylim(0,1) +
  scale_x_continuous(breaks = seq(0,50, 5)) +
  facet_wrap(~SampleID) +
  #guides(color=guide_legend(override.aes=list(shape=c(rep(16, 6), rep(5, 5)), size=c(rep(2, 6), rep(4, 5))))) +
  labs(x="Threshold (reads ≥ n)", y="Proportion of Gardnerella")

# plot - thresholds 1 to 5
gardAllVarDFThresholds %>%
  filter(Threshold<=5) %>%
  ggplot(aes(x=Threshold, y=propGard, shape=Method, size=Method, color=Var)) + 
  geom_point() +
  ggplotAll +
  ylim(0,1) +
  scale_x_continuous(breaks = seq(0,5, 1)) +
  facet_wrap(~SampleID) +
  #guides(color=guide_legend(override.aes=list(shape=c(rep(16, 6), rep(5, 5)), size=c(rep(2, 6), rep(4, 5))))) +
  labs(x="Threshold (reads ≥ n)", y="Proportion of Gardnerella")

View count of variants over ranges of thresholds

#fig.height=5, fig.width=6
V4Vars <- gardAllVarDF %>%
  filter(Method=="V4 region 16S amplicon") %>%
  select(SampleID, Var, propGard) %>%
  group_by(SampleID) %>%
  summarise(V4vars=n())

sum(V4Vars$V4vars)
## [1] 56
varCounts <- gardAllVarDFThresholds %>%
  filter(Method!="V4 region 16S amplicon") %>%
  select(SampleID, Method, Var, propGard, Threshold) %>%
  group_by(SampleID, Method, Threshold) %>%
  summarise(nVars=n()) %>%
  spread(Method, nVars) %>%
  ungroup() %>%
  full_join(map(1:50, ~mutate(V4Vars, Threshold=.x)) %>%
            purrr::reduce(full_join, by = c("SampleID", "V4vars", "Threshold")), 
            by=c("SampleID", "Threshold")) %>%
  replace_na(list(shotgunVars=0)) %>%
  gather("Method", "nVars", c(`Shotgun Clade`, `Shotgun Genomospecies`, `V4vars`))

# Thresholds 1:50
varCounts %>%
ggplot(aes(x=Threshold, y=nVars, group=Method, color=Method, linetype=Method)) +
  geom_line() +
  facet_wrap(~SampleID, scales="free_y") +
  scale_color_manual(values=c("#56B4E9", "#009E73", "#D55E00"), labels=c("Shotgun Clade", "Shotgun Genomospecies", "V4 variants")) +
  scale_color_manual(values=c("#56B4E9", "#009E73", "#D55E00"), labels=c("Shotgun Clade", "Shotgun Genomospecies", "V4 variants")) +
  labs(x="Threshold (reads≥n)", y="N Variants", color="Method")

varCounts %>%
ggplot(aes(x=Threshold, y=nVars, group=Method, color=Method, linetype=Method)) +
  geom_line() +
  facet_wrap(~SampleID, scales = "free_y") +
  xlim(1,5) +
 scale_linetype_manual(values=c(1,2,3), labels=c("Shotgun Clade", "Shotgun Genomospecies", "V4 variants")) +
  scale_color_manual(values=c("#56B4E9", "#009E73", "#D55E00"), labels=c("Shotgun Clade", "Shotgun Genomospecies", "V4 variants")) +
  labs(x="Threshold (reads≥n)", y="N Variants", color="Method")

Sensitivity and Specificity testing

Determine the how well variants correspond to each other.

# Samples we are testing
testSamples <- gardAllVarDF %>%
  .$SampleID %>%
  unique

# Number of V4 variants in these samples
nV4 <- gardAllVarDF %>% # number of v4 variants
  filter(Method=="V4 region 16S amplicon") %>%
  select(SampleID, Var) %>%
  nrow

 # Number of Clades found in all samples at each threshold up to 5
nClades <- gardAllVarDFThresholds %>%
  filter(Method=="Shotgun Clade",
         Threshold <= 5) %>%
  select(SampleID, Var, Threshold) %>%
  group_by(Threshold) %>%
  summarise(Total=n())

#  Number of Genomospecies found in all samples at each threshold up to 5
nGenomos <- gardAllVarDFThresholds %>%
  filter(Method=="Shotgun Genomospecies",
         Threshold <= 5) %>%
  select(SampleID, Var, Threshold) %>%
  group_by(Threshold) %>%
  summarise(Total=n())

Sensitivity

Sensitivity as percent of V4 variants or clades that are not represented by a clade or genomospecies

# define function to determine number of V4 variants or clades that are not represented by a clade or genomospecies
testSens <- function(testSample, comparison, thresh) {
 if(str_detect(comparison, "V4")){
   a <- gardAllVarDF %>%
      filter(SampleID == testSample,
             Method=="V4 region 16S amplicon") %>%
      dplyr::rename(V4=Method) %>%
      select(Var, V4)
  
  if("G3: indetereminant" %in% a$Var){
    a <- a %>%
      bind_rows(data.frame(Var=c("G2: Clades 1/2", "G1: Clades 3/4"), V4=c("V4 region 16S amplicon", "V4 region 16S amplicon"))) %>%
      unique %>%
      filter(Var!="G3: indetereminant")
  }
  
  if(comparison=="V4_clade"){
     b <- gardAllVarDFThresholds %>%
      filter(SampleID == testSample,
             Threshold == thresh,
             Method=="Shotgun Clade") %>%
      mutate(Var=case_when(Var=="Clade 1"~"G2: Clades 1/2", 
                          Var=="Clade 2"~"G2: Clades 1/2",
                          Var=="Clade 3"~"G1: Clades 3/4",
                          Var=="Clade 4"~"G1: Clades 3/4",
                          Var=="Clade 5"~"G4: Clade 5",
                          Var=="Clade 6"~"G5: Clade 6"),
            Shotgun=Method) %>%
      select(Var, Shotgun) %>%
      unique 
  }
  
  if(comparison=="V4_genomospecies"){
     b <- gardAllVarDFThresholds %>%
      filter(SampleID == testSample,
             Threshold == thresh,
             Method=="Shotgun Genomospecies") %>%
      mutate(Var=case_when(Var=="Genomospecies 1, G. vaginalis"~"G2: Clades 1/2", 
                          Var=="Genomospecies 2"~"G2: Clades 1/2",
                          Var=="Genomospecies 3"~"G2: Clades 1/2",
                          Var=="Genomospecies 4, G. piotti"~"G2: Clades 1/2",
                          Var=="Genomospecies 11"~"G2: Clades 1/2",
                          Var=="Genomospecies 5, G. leopoldii"~"G1: Clades 3/4",
                          Var=="Genomospecies 6, G. swidsinskii"~"G1: Clades 3/4",
                          Var=="Genomospecies 7"~"G1: Clades 3/4",
                          Var=="Genomospecies 8"~"G1: Clades 3/4",
                          Var=="Genomospecies 9"~"G1: Clades 3/4",
                          Var=="Genomospecies 10"~"G1: Clades 3/4",
                          Var=="Genomospecies 14"~"G1: Clades 3/4",
                          Var=="Genomospecies 12"~"G4: Clade 5",
                          Var=="Genomospecies 13"~"G5: Clade 6"),
            Shotgun=Method) %>%
      select(Var, Shotgun) %>%
      unique}
 
  c <- a %>%
      full_join(b, by="Var") %>%
      group_by(Shotgun) %>%
      tally %>%
      filter(is.na(Shotgun)) %>%
      .$n
  }
  
  if(comparison=="clade_genomospecies"){
    a <- gardAllVarDFThresholds %>%
      filter(SampleID == testSample,
             Threshold == thresh,
             Method =="Shotgun Clade") %>%
      dplyr::rename(Clade=Method) %>%
      select(Var, Clade)
     
    b <- gardAllVarDFThresholds %>%
      filter(SampleID == testSample,
             Threshold == thresh,
             Method=="Shotgun Genomospecies") %>%
      mutate(Var=case_when(Var=="Genomospecies 1, G. vaginalis"~"Clade 1", 
                          Var=="Genomospecies 2"~"Clade 1",
                          Var=="Genomospecies 3"~"Clade 2",
                          Var=="Genomospecies 4, G. piotti"~"Clade 2",
                          Var=="Genomospecies 11"~"Clade 2",
                          Var=="Genomospecies 5, G. leopoldii"~"Clade 4",
                          Var=="Genomospecies 6, G. swidsinskii"~"Clade 4",
                          Var=="Genomospecies 7"~"Clade 3",
                          Var=="Genomospecies 8"~"Clade 3",
                          Var=="Genomospecies 9"~"Clade 3",
                          Var=="Genomospecies 10"~"Clade 3",
                          Var=="Genomospecies 14"~"Clade 3",
                          Var=="Genomospecies 12"~"Clade 5",
                          Var=="Genomospecies 13"~"Clade 6"),
            Genomospecies=Method) %>%
      select(Var, Genomospecies) %>%
      unique  
    
   c <- a %>%
    full_join(b, by="Var") %>%
    group_by(Genomospecies) %>%
    tally %>%
    filter(is.na(Genomospecies)) %>%
    .$n
   }
 return(c)
}

V4 vs. Clade

data.frame(testSample=rep(testSamples, 5),# set variables for testSens function
           comparison=rep("V4_clade", 135),
           thresh=rep(c(1:5), 27)) %>%
  bind_cols(z=pmap(., testSens) %>%  #run testSens to find number of missing variables for each test
              modify_if(~length(.)==0, ~0) %>%
              flatten_dbl) %>%
  group_by(thresh) %>%
  summarize(nMissing=sum(z)) %>%  # count N missing at each threshold
  dplyr::rename(Threshold=thresh) %>%  # format results table
  mutate(Total = nV4,
         `Percent Missing`=round(nMissing/Total*100, 1)) %>%
  formattable
Threshold nMissing Total Percent Missing
1 5 56 8.9
2 5 56 8.9
3 5 56 8.9
4 6 56 10.7
5 6 56 10.7

V4 vs. genomospecies

data.frame(testSample=rep(testSamples, 5),  # set variables for testSens function
             comparison=rep("V4_genomospecies", 135),
             thresh=rep(c(1:5), 27)) %>%
  bind_cols(z=pmap(., testSens) %>%  # run testSens to find number of missing variants to each test
              modify_if(~length(.)==0, ~0) %>%
              flatten_dbl) %>%
  group_by(thresh) %>%
  summarize(nMissing=sum(z)) %>% # count N missing at each threshold
  dplyr::rename(Threshold=thresh) %>%  # format results table
  mutate(Total=nV4,  
    `Percent Missing`=round(nMissing/Total*100, 1)) %>%
  formattable
Threshold nMissing Total Percent Missing
1 5 56 8.9
2 5 56 8.9
3 7 56 12.5
4 7 56 12.5
5 8 56 14.3

Clade vs. genomospecies

data.frame(testSample=rep(testSamples, 5), # set variables for testSens function
             comparison=rep("clade_genomospecies", 135),
             thresh=rep(c(1:5), 27)) %>%
  bind_cols(z=pmap(., testSens) %>% # run testSens to find number of missing variants to each test
              modify_if(~length(.)==0, ~0) %>%
              flatten_dbl) %>%
  group_by(thresh) %>%
  summarize(nMissing=sum(z)) %>% # count N missing at each threshold
  dplyr::rename(Threshold=thresh) %>% # format results table
  left_join(nClades, by="Threshold") %>%
  mutate(`Percent Missing`=round(nMissing/Total*100, 1)) %>%
  formattable
Threshold nMissing Total Percent Missing
1 8 76 10.5
2 11 72 15.3
3 11 69 15.9
4 10 67 14.9
5 13 67 19.4

Precision

Precision here defined as number of clades or genomospecies that are not supported by a V4 variant

# define function to determine the number of clades or genomospecies that are not supported by a V4 variant
testSpec <- function(testSample, comparison, thresh) {
  a <- gardAllVarDF %>%
    filter(SampleID == testSample,
           Method=="V4 region 16S amplicon") %>%
    dplyr::rename(V4=Method) %>%
    select(Var, V4)

  if("G3: indetereminant" %in% a$Var){
    a <- a %>%
      bind_rows(data.frame(Var=c("G2: Clades 1/2", "G1: Clades 3/4"), V4=c("V4 region 16S amplicon", "V4 region 16S amplicon"))) %>%
      unique %>%
      filter(Var!="G3: indetereminant")
  }
  
  if(comparison=="V4_clade"){
     b <- gardAllVarDFThresholds %>%
      filter(SampleID == testSample,
             Threshold == thresh,
             Method=="Shotgun Clade") %>%
      mutate(Var=case_when(Var=="Clade 1"~"G2: Clades 1/2", 
                          Var=="Clade 2"~"G2: Clades 1/2",
                          Var=="Clade 3"~"G1: Clades 3/4",
                          Var=="Clade 4"~"G1: Clades 3/4",
                          Var=="Clade 5"~"G4: Clade 5",
                          Var=="Clade 6"~"G5: Clade 6"),
            Shotgun=Method) %>%
      select(Var, Shotgun) %>%
      unique 
  }
  
  if(comparison=="V4_genomospecies"){
     b <- gardAllVarDFThresholds %>%
      filter(SampleID == testSample,
             Threshold == thresh,
             Method=="Shotgun Genomospecies") %>%
      mutate(Var=case_when(Var=="Genomospecies 1, G. vaginalis"~"G2: Clades 1/2", 
                          Var=="Genomospecies 2"~"G2: Clades 1/2",
                          Var=="Genomospecies 3"~"G2: Clades 1/2",
                          Var=="Genomospecies 4, G. piotti"~"G2: Clades 1/2",
                          Var=="Genomospecies 11"~"G2: Clades 1/2",
                          Var=="Genomospecies 5, G. leopoldii"~"G1: Clades 3/4",
                          Var=="Genomospecies 6, G. swidsinskii"~"G1: Clades 3/4",
                          Var=="Genomospecies 7"~"G1: Clades 3/4",
                          Var=="Genomospecies 8"~"G1: Clades 3/4",
                          Var=="Genomospecies 9"~"G1: Clades 3/4",
                          Var=="Genomospecies 10"~"G1: Clades 3/4",
                          Var=="Genomospecies 14"~"G1: Clades 3/4",
                          Var=="Genomospecies 12"~"G4: Clade 5",
                          Var=="Genomospecies 13"~"G5: Clade 6"),
            Shotgun=Method) %>%
      select(Var, Shotgun) %>%
      unique
     }
  
   c <- a %>%
    full_join(b, by="Var") %>%
    group_by(V4) %>%
    tally %>%
    filter(is.na(V4)) %>%
    .$n
   
return(c)}

V4 vs. Clade

data.frame(testSample=rep(testSamples, 5),  # set variables for testSpec function
             comparison=rep("V4_clade", 135),
             thresh=rep(c(1:5), 27)) %>%
  bind_cols(z=pmap(., testSpec) %>%  # run testSpec to find number of missing variants to each test
              modify_if(~length(.)==0, ~0) %>%
              flatten_dbl) %>%
  group_by(thresh) %>%
  summarize(nUnsupported=sum(z)) %>%  # count N unsupported variants at each threshold
  dplyr::rename(Threshold=thresh) %>% # format results table
  left_join(nClades, by="Threshold") %>%
  mutate(`Percent Unsupported`=round(nUnsupported/Total*100, 1)) %>%
  formattable
Threshold nUnsupported Total Percent Unsupported
1 4 76 5.3
2 2 72 2.8
3 2 69 2.9
4 2 67 3.0
5 2 67 3.0

V4 vs. Genomospecies

data.frame(testSample=rep(testSamples, 5),  # set variables for testSpec function
             comparison=rep("V4_genomospecies", 135),
             thresh=rep(c(1:5), 27)) %>%
  bind_cols(z=pmap(., testSpec) %>%  # run testSpec to find number of missing variants at each test
              modify_if(~length(.)==0, ~0) %>%
              flatten_dbl) %>%
  group_by(thresh) %>%
  summarize(nUnsupported=sum(z)) %>%  # count N unsupported variants at each threshold
  dplyr::rename(Threshold=thresh) %>% # format results table
  left_join(nGenomos, by="Threshold") %>%
  mutate(`Percent Unsupported`=round(nUnsupported/Total*100, 1)) %>%
  formattable
Threshold nUnsupported Total Percent Unsupported
1 4 123 3.3
2 2 107 1.9
3 2 101 2.0
4 1 98 1.0
5 1 91 1.1

Choose a threshold of ≥2 reads for present. Ignore all one-read assignments

gardAllVarDF_withThreshold <- gardAllVarDFThresholds %>%
  filter(str_detect(Method, "Shotgun")&Threshold==2|str_detect(Method, "V4")&Threshold==1) %>%
  select(-Threshold)

View Variants in Samples

Descriptives

V4 variants found

gardAllVarDF_withThreshold %>%
  filter(Method=="V4 region 16S amplicon") %>%
  count(SampleID) %>%
  summarize(min=min(n), max=max(n), mean=mean(n), total=sum(n))
## # A tibble: 1 × 4
##     min   max  mean total
##   <int> <int> <dbl> <int>
## 1     1     5  2.15    56

Clades found

gardAllVarDF_withThreshold %>%
  filter(Method=="Shotgun Clade") %>%
  count(SampleID) %>%
  summarize(min=min(n), max=max(n), mean=mean(n), total=sum(n))
## # A tibble: 1 × 4
##     min   max  mean total
##   <int> <int> <dbl> <int>
## 1     1     6  3.13    72

Genomospecies found

gardAllVarDF_withThreshold %>%
  filter(Method=="Shotgun Genomospecies") %>%
  count(SampleID) %>%
  summarize(min=min(n), max=max(n), mean=mean(n), total=sum(n))
## # A tibble: 1 × 4
##     min   max  mean total
##   <int> <int> <dbl> <int>
## 1     1    13  4.65   107

All variants

gardAllVarDF_withThreshold %>%
ggplot(aes(x=GWcoll, y=propGard, shape=Method, size=Method, color=Var)) +
  geom_point() +
  ggplotAll +
  facet_wrap(~SubjectID, nrow=2) +
  labs(x="Gestation weeks at collection", y="Proportion of Gardnerella", shape="Method", color="Variant") +
  guides(color=guide_legend(override.aes=list(shape=15, size=3))) +
  xlim(0,40)

Clades vs. 16S

gardAllVarDF_withThreshold %>%
  filter(Method!="Shotgun Genomospecies") %>%
ggplot(aes(x=GWcoll, y=propGard, shape=Method, size=Method, color=Var)) +
  geom_point() +
  ggplotCladeV4 +
  facet_wrap(~SubjectID, nrow=2) +
  labs(x="Gestation weeks at collection", y="Proportion of Gardnerella", shape="Method", color="Variant") +
  guides(color=guide_legend(override.aes=list(shape=15, size=3))) +
  xlim(0,40)

Shotgun methods only

gardAllVarDF_withThreshold %>%
  filter(Method!="V4 region 16S amplicon") %>%
  ggplot(aes(x=GWcoll, y=propGard, shape=Method, size=Method, color=Var)) +
  geom_point() +
  ggplotShotgunOnly +
  facet_wrap(~SubjectID, nrow=2) +
  labs(x="Gestation weeks at collection", y="Proportion of Gardnerella", shape="Method", color="Variant") +
  guides(color=guide_legend(override.aes=list(shape=15, size=3), ncol=2)) +
  xlim(0,40)

Comparing methods alternate figures

Clade vs. 16S

# samples with variant G3
g3Samples <- gardAllVarDF_withThreshold %>%
  filter(Var=="G3: indetereminant") %>%
  .$SampleID

G3_50plus <- gardAllVarDF_withThreshold %>%
  filter(Var=="G3: indetereminant") %>%
  mutate(propGard=round(propGard, 2)) %>%
  filter(propGard>=.50)
G3_50plus
## # A tibble: 2 × 7
##   SampleID   Var                propGard Method               n SubjectID GWcoll
##   <chr>      <fct>                 <dbl> <chr>            <dbl> <chr>      <dbl>
## 1 1062701128 G3: indetereminant     0.65 V4 region 16S a…    NA 10627       12  
## 2 2050501168 G3: indetereminant     0.5  V4 region 16S a…    NA 20505       16.4
Sample 1062701128 has 65% G3, and sample 2050501168 has 50% G3.
cladeVs16SPlot <- gardAllVarDF_withThreshold %>%
  filter(Method!="Shotgun Genomospecies") %>%
  mutate(Var=as.character(Var),
         Var=case_when(Var=="Clade 1"~"G2: Clades 1/2",
                   Var=="Clade 2"~"G2: Clades 1/2",
                   Var=="Clade 3"~"G1: Clades 3/4",
                   Var=="Clade 4"~"G1: Clades 3/4",
                   Var=="Clade 5"~"G4: Clade 5",
                   Var=="Clade 6"~"G5: Clade 6",
                   str_detect(Var, "G")~Var)) %>%
  group_by(SampleID, Method, Var) %>%
  summarise(propGard=sum(propGard)) %>%
  ungroup %>%
  spread(Method, propGard) %>%
  replace_na(list(`V4 region 16S amplicon`=0, 
                  `Shotgun Clade`=0)) %>%
  mutate(`≥50% G3`=SampleID %in% G3_50plus$SampleID) %>%
  filter(Var!="G3: indetereminant") %>%
  ggplot(aes(x=`V4 region 16S amplicon`, y=`Shotgun Clade`)) +
  geom_abline(aes(intercept=0, slope=1), color="gray", linetype=2) +
  geom_smooth(method="lm", se=FALSE, color="black", linetype=1) +
  stat_cor(method = "pearson", alternative = "two.sided", cor.coef.name = c("R"), p.digits = 3, label.y = 0.875, label.x = 0) +
  geom_point(aes(color=Var, shape=`≥50% G3`), alpha=0.6, size=2) +
  scale_color_manual(values=c("#b22222", "#214fc6", "#E69F00", "#009E73")) +
  theme(legend.position = c(0.6, 0.25),
        legend.direction = "vertical",
        legend.text = element_text(size=7),
        legend.title = element_text(size=8),
        legend.background = element_blank()) +
  guides(shape="none") +
  labs(color="V4 Variant")
cladeVs16SPlot

Genomospecies vs. 16S

gsVs16SPlot <- gardAllVarDF_withThreshold %>%
  filter(Method!="Shotgun Clade") %>%
  mutate(Var=as.character(Var),
         Var=case_when(Var=="Genomospecies 1, G. vaginalis"~"G2: Clades 1/2",
                       Var=="Genomospecies 2"~"G2: Clades 1/2",
                       Var=="Genomospecies 3"~"G2: Clades 1/2",
                       Var=="Genomospecies 4, G. piotti"~"G2: Clades 1/2",
                       Var=="Genomospecies 11"~"G2: Clades 1/2",
                       Var=="Genomospecies 5, G. leopoldii"~"G1: Clades 3/4",
                       Var=="Genomospecies 6, G. swidsinskii"~"G1: Clades 3/4",
                       Var=="Genomospecies 7"~"G1: Clades 3/4",
                       Var=="Genomospecies 8"~"G1: Clades 3/4",
                       Var=="Genomospecies 9"~"G1: Clades 3/4",
                       Var=="Genomospecies 10"~"G1: Clades 3/4",
                       Var=="Genomospecies 14"~"G1: Clades 3/4",
                       Var=="Genomospecies 12"~"G4: Clade 5",
                       Var=="Genomospecies 13"~"G5: Clade 6",
                       str_detect(Var, "G")~Var),
         Var=case_when(Var=="G2: Clades 1/2"~"G2: Genomospeces 1/2/3/4/11",
                       Var=="G1: Clades 3/4"~"G1: Genomospeces 5/6/7/8/9/10/14",
                       Var=="G4: Clade 5"~"G4: Genomospecies 12",
                       Var=="G5: Clade 6"~"G5: Genomospecies 13")) %>%
  group_by(SampleID, Method, Var) %>%
  summarise(propGard=sum(propGard)) %>%
  ungroup %>%
  spread(Method, propGard) %>%
  replace_na(list(`V4 region 16S amplicon`=0, 
                  `Shotgun Genomospecies`=0)) %>%
  mutate(`≥50% G3`=SampleID %in% G3_50plus$SampleID) %>%
  filter(Var!="G3: indetereminant") %>%
  ggplot(aes(x=`V4 region 16S amplicon`, y=`Shotgun Genomospecies`)) +
  geom_abline(aes(intercept=0, slope=1), color="gray", linetype=2) +
  geom_smooth(method="lm", se=FALSE, color="black", linetype=1) +
  stat_cor(method = "pearson", alternative = "two.sided", cor.coef.name = c("R"), p.digits = 3, label.y = 0.875, label.x = 0) +
  geom_point(aes(color=Var, shape=`≥50% G3`), alpha=0.6, size=2) +
  scale_color_manual(values=c("#b22222", "#214fc6", "#E69F00", "#009E73")) +
  theme(legend.position = c(0.675, 0.25),
        legend.direction = "vertical",
        legend.text = element_text(size=7),
        legend.title = element_text(size=8),
        legend.background = element_blank()) +
  guides(shape="none") +
  labs(color="V4 Variant")
gsVs16SPlot

Clades vs. Genomospecies

gardAllVarDF_withThreshold %>%
  filter(Method=="Shotgun Clade") %>%
  .$Var %>%
  unique
## [1] Clade 1 Clade 2 Clade 4 Clade 3 Clade 5 Clade 6
## 25 Levels: Clade 1 Clade 2 Clade 3 Clade 4 Clade 5 ... G5: Clade 6
cladeVsGsPlot <- gardAllVarDF_withThreshold %>%
  filter(Method!="V4 region 16S amplicon") %>%
  mutate(Var=as.character(Var),
         Var=case_when(Var=="Genomospecies 1, G. vaginalis"~"Clade 1",
                       Var=="Genomospecies 2"~"Clade 1",
                       Var=="Genomospecies 3"~"Clade 2",
                       Var=="Genomospecies 4, G. piotti"~"Clade 2",
                       Var=="Genomospecies 11"~"Clade 2",
                       Var=="Genomospecies 5, G. leopoldii"~"Clade 4",
                       Var=="Genomospecies 6, G. swidsinskii"~"Clade 4",
                       Var=="Genomospecies 7"~"Clade 3",
                       Var=="Genomospecies 8"~"Clade 3",
                       Var=="Genomospecies 9"~"Clade 3",
                       Var=="Genomospecies 10"~"Clade 3",
                       Var=="Genomospecies 14"~"Clade 3",
                       Var=="Genomospecies 12"~"Clade 5",
                       Var=="Genomospecies 13"~"Clade 6",
                       str_detect(Var, "Clade")~Var),
         Var=case_when(Var=="Clade 1"~"Clade 1: Genomospeces 1/2",
                       Var=="Clade 2"~"Clade 2: Genomospeces 3/4/11",
                       Var=="Clade 3"~"Clade 3: Genomospecies 7/8/9/10/14",
                       Var=="Clade 4"~"Clade 4: Genomospecies 5/6",
                       Var=="Clade 5"~"Clade 5: Genomospecies 12",
                       Var=="Clade 6"~"Clade 6: Genomospecies 13")) %>%
  group_by(SampleID, Method, Var) %>%
  summarise(propGard=sum(propGard)) %>%
  ungroup %>%
  spread(Method, propGard) %>%
  replace_na(list(`Shotgun Clade`=0, 
                  `Shotgun Genomospecies`=0)) %>%
  ggplot(aes(x=`Shotgun Clade`, y=`Shotgun Genomospecies`)) +
  geom_abline(aes(intercept=0, slope=1), color="gray", linetype=2) +
  geom_smooth(method="lm", se=FALSE, color="black", linetype=1) +
  stat_cor(method = "pearson", alternative = "two.sided", cor.coef.name = c("R"), p.digits = 3, label.y = 0.875, label.x = 0) +
  geom_point(aes(color=Var), alpha=0.6, size=2) +
  scale_color_manual(values=c("#CC79A7", "#D55E00", "#56B4E9", "#0072B2", "#E69F00", "#009E73")) +
  theme(legend.position = c(0.71, 0.22),
        legend.direction = "vertical",
        legend.text = element_text(size=7),
        legend.title = element_text(size=8),
        legend.background = element_blank()) +
  labs(color="Clade")
cladeVsGsPlot

Figure 2ABC:

#fig.width=6, fig.height=2
plot_grid(plotlist=list(cladeVs16SPlot, gsVs16SPlot, cladeVsGsPlot), nrow = 1, labels=c("A", "B", "C"), label_size = 15)

#ggsave(filename = file.path(figureOut, paste(Sys.Date(), "figure2ABC_mappingvs16S.png", sep="_")))

Samples where we lose all Gardnerella

In subject 10608: No Gardnerella was detected. Shotgun sequencing can demonstrate less sensitivity than amplicon sequencing.

Subject 10608

#gardnerella found by metaphlan in subject 10608
gardAllVarDF %>%
  filter(SubjectID==10608) %>%
  select(SampleID, Var, propGard, Method)
## # A tibble: 3 × 4
##   SampleID   Var            propGard Method                
##   <chr>      <fct>             <dbl> <chr>                 
## 1 1060801198 G1: Clades 3/4    0.713 V4 region 16S amplicon
## 2 1060801198 G4: Clade 5       0.287 V4 region 16S amplicon
## 3 1060801338 G1: Clades 3/4    1     V4 region 16S amplicon
samples10608 <- gardAllVarDF %>%
  filter(SubjectID==10608) %>%
  .$SampleID %>%
  unique
samples10608
## [1] "1060801198" "1060801338"
#percent Gardnerella in these samples
mDF %>%
  select(SampleID, Gardnerella_vaginalis) %>%
  filter(SampleID %in% samples10608)
## # A tibble: 2 × 2
##   SampleID   Gardnerella_vaginalis
##   <chr>                      <dbl>
## 1 1060801198                     0
## 2 1060801338                     0
#read depth of these samples
sgDF %>%
  filter(SampleID %in% samples10608) %>%
  select(SampleID, TotalPairs, bbmapFiltered_pairs, pctHuman)
## # A tibble: 2 × 4
##   SampleID   TotalPairs bbmapFiltered_pairs pctHuman
##   <chr>           <dbl>               <dbl>    <dbl>
## 1 1060801198   14655421              258847     98.0
## 2 1060801338   15223063              195118     98.5
ggplot(data=sgDF) +
  geom_density(aes(x=TotalPairs)) +
  geom_vline(xintercept = sgDF$TotalPairs[sgDF$SampleID==samples10608[1]], color="red") +
  geom_vline(xintercept = sgDF$TotalPairs[sgDF$SampleID==samples10608[2]], color="blue")

ggplot(data=sgDF) +
  geom_density(aes(x=bbmapFiltered_pairs)) +
  geom_vline(xintercept = sgDF$bbmapFiltered_pairs[sgDF$SampleID==samples10608[1]], color="red") +
  geom_vline(xintercept = sgDF$bbmapFiltered_pairs[sgDF$SampleID==samples10608[2]], color="blue")

Sample 1050801318

#No gardnerella found by metaphlan in sample 1050801318
gardAllVarDF %>%
  filter(SubjectID=="10508") %>%
  select(SampleID, Var, propGard, Method)
## # A tibble: 8 × 4
##   SampleID   Var                           propGard Method                
##   <chr>      <fct>                            <dbl> <chr>                 
## 1 1050801308 G1: Clades 3/4                   0.799 V4 region 16S amplicon
## 2 1050801308 G3: indetereminant               0.201 V4 region 16S amplicon
## 3 1050801318 G2: Clades 1/2                   1     V4 region 16S amplicon
## 4 1050801308 Clade 2                          0.231 Shotgun Clade         
## 5 1050801308 Clade 4                          0.769 Shotgun Clade         
## 6 1050801308 Genomospecies 3                  0.286 Shotgun Genomospecies 
## 7 1050801308 Genomospecies 4, G. piotti       0.143 Shotgun Genomospecies 
## 8 1050801308 Genomospecies 5, G. leopoldii    0.571 Shotgun Genomospecies
#read depth of all sub samples
sgDF %>%
  filter(SampleID=="1050801318") %>%
  select(SampleID, TotalPairs, bbmapFiltered_pairs, pctHuman)
## # A tibble: 1 × 4
##   SampleID   TotalPairs bbmapFiltered_pairs pctHuman
##   <chr>           <dbl>               <dbl>    <dbl>
## 1 1050801318   18029410              494084     96.8
#Bacterial reads in all subject 10508 samples
ggplot(data=sgDF) +
  geom_density(aes(x=bbmapFiltered_pairs)) +
  geom_vline(xintercept = sgDF$bbmapFiltered_pairs[sgDF$SampleID=="1050801318"], color="blue", show.legend = TRUE) +
  geom_vline(xintercept = sgDF$bbmapFiltered_pairs[sgDF$SampleID=="1050801308"], color="red", show.legend = TRUE) +
  geom_vline(xintercept = sgDF$bbmapFiltered_pairs[sgDF$SampleID=="1050835178"], color="green", show.legend = TRUE)

Compare to Gardnerella found by MetaPhlAn2

Gardnerella in MetaPhlAn2 but not this method

Assess uncharacterized Gardnerella when measuring clades:

sgCladeDF %>%
  filter(n>=2) %>%
  group_by(SampleID) %>%
  summarise(n=sum(n)) %>%
  full_join(mDF[,c("SampleID", "Gardnerella_vaginalis")], by="SampleID") %>%
  filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 0 × 3
## # … with 3 variables: SampleID <chr>, n <dbl>, Gardnerella_vaginalis <dbl>

No uncharacterized Gardnerella when measuring clades.

Assess uncharacterized Gardnerella when measuring genomospecies:

sgGenomoDF %>%
  filter(n>=2) %>%
  group_by(SampleID) %>%
  summarise(n=sum(n)) %>%
  full_join(mDF[,c("SampleID", "Gardnerella_vaginalis")], by="SampleID") %>%
  filter(is.na(n), Gardnerella_vaginalis>0)
## # A tibble: 1 × 3
##   SampleID       n Gardnerella_vaginalis
##   <chr>      <dbl>                 <dbl>
## 1 1061235278    NA                0.0152

One sample with uncharacterized Gardnerella by genomospecies.

Session Info

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] cowplot_1.1.1       ggpubr_0.4.0        formattable_0.2.1  
##  [4] Biostrings_2.60.2   GenomeInfoDb_1.28.4 XVector_0.32.0     
##  [7] IRanges_2.26.0      S4Vectors_0.30.2    BiocGenerics_0.38.0
## [10] phyloseq_1.36.0     forcats_0.5.1       stringr_1.4.0      
## [13] dplyr_1.0.7         purrr_0.3.4         readr_2.0.2        
## [16] tidyr_1.1.4         tibble_3.1.5        ggplot2_3.3.5      
## [19] tidyverse_1.3.1    
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.0-2       ggsignif_0.6.3         ellipsis_0.3.2        
##  [4] fs_1.5.0               rstudioapi_0.13        farver_2.1.0          
##  [7] bit64_4.0.5            fansi_0.5.0            lubridate_1.8.0       
## [10] xml2_1.3.2             codetools_0.2-18       splines_4.1.1         
## [13] knitr_1.36             ade4_1.7-18            jsonlite_1.7.2        
## [16] broom_0.7.9            cluster_2.1.2          dbplyr_2.1.1          
## [19] compiler_4.1.1         httr_1.4.2             backports_1.2.1       
## [22] assertthat_0.2.1       Matrix_1.3-4           fastmap_1.1.0         
## [25] cli_3.4.1              htmltools_0.5.2        tools_4.1.1           
## [28] igraph_1.2.6           gtable_0.3.0           glue_1.4.2            
## [31] GenomeInfoDbData_1.2.6 reshape2_1.4.4         Rcpp_1.0.7            
## [34] carData_3.0-5          Biobase_2.52.0         cellranger_1.1.0      
## [37] jquerylib_0.1.4        vctrs_0.5.1            rhdf5filters_1.4.0    
## [40] multtest_2.48.0        ape_5.5                nlme_3.1-153          
## [43] iterators_1.0.13       xfun_0.26              rvest_1.0.1           
## [46] lifecycle_1.0.3        rstatix_0.7.0          zlibbioc_1.38.0       
## [49] MASS_7.3-54            scales_1.1.1           vroom_1.5.5           
## [52] hms_1.1.1              biomformat_1.20.0      rhdf5_2.36.0          
## [55] yaml_2.2.1             sass_0.4.0             stringi_1.7.8         
## [58] highr_0.9              foreach_1.5.1          permute_0.9-5         
## [61] rlang_1.0.6            pkgconfig_2.0.3        bitops_1.0-7          
## [64] evaluate_0.14          lattice_0.20-45        Rhdf5lib_1.14.2       
## [67] htmlwidgets_1.5.4      labeling_0.4.2         bit_4.0.4             
## [70] tidyselect_1.1.1       plyr_1.8.6             magrittr_2.0.1        
## [73] R6_2.5.1               generics_0.1.0         DBI_1.1.1             
## [76] pillar_1.6.3           haven_2.4.3            withr_2.4.2           
## [79] mgcv_1.8-38            survival_3.2-13        abind_1.4-5           
## [82] RCurl_1.98-1.5         modelr_0.1.8           crayon_1.4.1          
## [85] car_3.0-12             utf8_1.2.2             tzdb_0.1.2            
## [88] rmarkdown_2.11         grid_4.1.1             readxl_1.3.1          
## [91] data.table_1.14.2      vegan_2.5-7            reprex_2.0.1          
## [94] digest_0.6.28          munsell_0.5.0          bslib_0.3.1